# Step 1 -> Read data into R workstation
# RCurl provides getURL() so the csv file can be fetched over HTTPS
library(RCurl)
url <- paste0("https://raw.githubusercontent.com",
              "/analyticsbook/book/main/data/AD.csv")
AD <- read.csv(text = getURL(url))
# str(AD)

# Step 2 -> Data preprocessing
# Split the data into the predictor matrix X and the outcome vector Y
X <- AD[, 2:16]
Y <- AD$DX_bl

# Recode the outcome so it is clearly categorical: "0" becomes "c0"
# and "1" becomes "c1". This highlights that "DX_bl" is a class
# label, not a numerical variable.
Y <- paste0("c", Y)
# as.factor() converts the character vector into a factor, the
# representation modeling functions expect for classification.
Y <- as.factor(Y)

# Assemble predictors and outcome into a single data frame
data <- data.frame(X, Y)
names(data)[16] <- "DX_bl"

# Fix the random seed so the train/test split is reproducible
set.seed(1)
# Randomly pick half of the rows for training ...
train.ix <- sample(nrow(data), floor(nrow(data) / 2))
data.train <- data[train.ix, ]
# ... and keep the remaining half for testing
data.test <- data[-train.ix, ]
# Step 3 -> Use glm() to build a "full" logistic regression model of
# DX_bl on all 15 predictors in the training set. family = "binomial"
# selects logistic regression for the binary outcome.
logit.AD.full <- glm(DX_bl~., data = data.train,
                     family = "binomial")
# Report coefficient estimates, standard errors, z statistics,
# p-values, and the null/residual deviances (recorded output below).
summary(logit.AD.full)
## 
## Call:
## glm(formula = DX_bl ~ ., family = "binomial", data = data.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4250  -0.3645  -0.0704   0.2074   3.1707  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  43.97098    7.83797   5.610 2.02e-08 ***
## AGE          -0.07304    0.03875  -1.885  0.05945 .  
## PTGENDER      0.48668    0.46682   1.043  0.29716    
## PTEDUCAT     -0.24907    0.08714  -2.858  0.00426 ** 
## FDG          -3.28887    0.59927  -5.488 4.06e-08 ***
## AV45          2.09311    1.36020   1.539  0.12385    
## HippoNV     -38.03422    6.16738  -6.167 6.96e-10 ***
## e2_1          0.90115    0.85564   1.053  0.29225    
## e4_1          0.56917    0.54502   1.044  0.29634    
## rs3818361    -0.47249    0.45309  -1.043  0.29703    
## rs744373      0.02681    0.44235   0.061  0.95166    
## rs11136000   -0.31382    0.46274  -0.678  0.49766    
## rs610932      0.55388    0.49832   1.112  0.26635    
## rs3851179    -0.18635    0.44872  -0.415  0.67793    
## rs3764650    -0.48152    0.54982  -0.876  0.38115    
## rs3865444     0.74252    0.45761   1.623  0.10467    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 349.42  on 257  degrees of freedom
## Residual deviance: 139.58  on 242  degrees of freedom
## AIC: 171.58
## 
## Number of Fisher Scoring iterations: 7
# Step 4 -> use step() to perform automatic (stepwise) model
# selection: predictors are repeatedly added/dropped ("both"
# directions) to minimize AIC, effectively deleting the
# insignificant variables. trace = 0 suppresses the step-by-step
# printout.
logit.AD.reduced <- step(logit.AD.full, direction="both",
                         trace = 0)
summary(logit.AD.reduced)
## 
## Call:
## glm(formula = DX_bl ~ AGE + PTEDUCAT + FDG + AV45 + HippoNV + 
##     rs3865444, family = "binomial", data = data.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38957  -0.42407  -0.09268   0.25092   2.73658  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  42.68795    7.07058   6.037 1.57e-09 ***
## AGE          -0.07993    0.03650  -2.190  0.02853 *  
## PTEDUCAT     -0.22195    0.08242  -2.693  0.00708 ** 
## FDG          -3.16994    0.55129  -5.750 8.92e-09 ***
## AV45          2.62670    1.18420   2.218  0.02655 *  
## HippoNV     -36.22215    5.53083  -6.549 5.79e-11 ***
## rs3865444     0.71373    0.44290   1.612  0.10707    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 349.42  on 257  degrees of freedom
## Residual deviance: 144.62  on 251  degrees of freedom
## AIC: 158.62
## 
## Number of Fisher Scoring iterations: 7
# Step 4 continued: compare reduced vs. full model with a likelihood
# ratio test. test = "LRT" derives the p-value via the Likelihood
# Ratio Test; a large p-value (0.8305 below) means the dropped
# predictors did not significantly worsen the fit.
anova(logit.AD.reduced,logit.AD.full,test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: DX_bl ~ AGE + PTEDUCAT + FDG + AV45 + HippoNV + rs3865444
## Model 2: DX_bl ~ AGE + PTGENDER + PTEDUCAT + FDG + AV45 + HippoNV + e2_1 + 
##     e4_1 + rs3818361 + rs744373 + rs11136000 + rs610932 + rs3851179 + 
##     rs3764650 + rs3865444
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       251     144.62                     
## 2       242     139.58  9   5.0438   0.8305
# Step 5 -> test the significance of the logistic model:
# compare the residual deviance to a chi-square distribution with
# the residual degrees of freedom (a lack-of-fit test; a p-value
# > 0.10 suggests little-to-no lack-of-fit).
dev.p.val <- 1 - pchisq(logit.AD.reduced$deviance,
                        logit.AD.reduced$df.residual)
dev.p.val
## [1] 1
# Step 6 -> Predict on the test data with the reduced logistic
# regression model. With the default type = "link", predict()
# returns the linear predictor (log-odds), NOT a probability.
y_hat <- predict(logit.AD.reduced, data.test)
# Step 7 -> Evaluate the prediction performance with the three main
# classification metrics: Accuracy, Sensitivity (1 - false negative
# rate), and Specificity (1 - false positive rate).
# Dichotomize the log-odds at the default cut-off 0 (equivalent to
# probability 0.5; a default choice, not necessarily optimal):
# nonnegative log-odds -> class "c1", negative -> class "c0".
# ifelse() is vectorized and, unlike the original pair of which()
# assignments, also classifies the boundary case y_hat == 0 instead
# of leaving it as a stringified number.
y_hat2 <- ifelse(y_hat >= 0, "c1", "c0")

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# confusionMatrix() in the package "caret" summarizes the prediction
# performance of a classification model, reporting Accuracy,
# Sensitivity, Specificity, Kappa, and more.
library(e1071)
confusionMatrix(table(y_hat2, data.test$DX_bl))
## Confusion Matrix and Statistics
## 
##       
## y_hat2  c0  c1
##     c0 117  29
##     c1  16  97
##                                           
##                Accuracy : 0.8263          
##                  95% CI : (0.7745, 0.8704)
##     No Information Rate : 0.5135          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6513          
##                                           
##  Mcnemar's Test P-Value : 0.07364         
##                                           
##             Sensitivity : 0.8797          
##             Specificity : 0.7698          
##          Pos Pred Value : 0.8014          
##          Neg Pred Value : 0.8584          
##              Prevalence : 0.5135          
##          Detection Rate : 0.4517          
##    Detection Prevalence : 0.5637          
##       Balanced Accuracy : 0.8248          
##                                           
##        'Positive' Class : c0              
## 
# (2) The ROC curve is another commonly reported summary of a
# classifier: sensitivity traced against 1 - specificity across
# every possible cut-off value.
library(pROC) 
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# roc() pairs the observed labels with the continuous scores; build
# the ROC object first, then draw it.
roc.obj <- roc(data.test$DX_bl, y_hat)
plot(roc.obj, col = "blue", main = "ROC Curve")
## Setting levels: control = c0, case = c1
## Setting direction: controls < cases

## coefficients with their profile-likelihood 95% CIs
cbind(coef = coef(logit.AD.reduced), confint(logit.AD.reduced))
## Waiting for profiling to be done...
##                     coef       2.5 %       97.5 %
## (Intercept)  42.68794758  29.9745022  57.88659748
## AGE          -0.07993473  -0.1547680  -0.01059348
## PTEDUCAT     -0.22195425  -0.3905105  -0.06537066
## FDG          -3.16994212  -4.3519800  -2.17636447
## AV45          2.62670085   0.3736259   5.04703489
## HippoNV     -36.22214822 -48.1671093 -26.35100122
## rs3865444     0.71373441  -0.1348687   1.61273264
# Remark: how to obtain the 95% CI of the predictions.
# type = "link" returns the linear predictor; se.fit = TRUE also
# returns its standard error, which is the information needed to
# construct approximate 95% CIs for the predictions.
y_hat <- predict(logit.AD.reduced, data.test, type = "link", 
                 se.fit = TRUE) 
data.test$fit    <- y_hat$fit
data.test$se.fit <- y_hat$se.fit
# Convert the CI on the link (log-odds) scale to the probability
# scale with the logistic CDF plogis(x) = exp(x) / (1 + exp(x)).
# plogis() (base stats) is numerically stable for large |x|, unlike
# forming exp(x) / (1 + exp(x)) directly; these 95% CIs are only
# approximate.
data.test <- within(data.test, {
  fitted    = plogis(fit)
  fit.lower = plogis(fit - 1.96 * se.fit)
  fit.upper = plogis(fit + 1.96 * se.fit)
})
## odds ratios (exponentiated coefficients) and their 95% CIs
exp(cbind(OR = coef(logit.AD.reduced), 
          confint(logit.AD.reduced)))
## Waiting for profiling to be done...
##                       OR        2.5 %       97.5 %
## (Intercept) 3.460510e+18 1.041744e+13 1.379844e+25
## AGE         9.231766e-01 8.566139e-01 9.894624e-01
## PTEDUCAT    8.009520e-01 6.767113e-01 9.367202e-01
## FDG         4.200603e-02 1.288128e-02 1.134532e-01
## AV45        1.382807e+01 1.452993e+00 1.555605e+02
## HippoNV     1.857466e-16 1.205842e-21 3.596711e-12
## rs3865444   2.041601e+00 8.738306e-01 5.016501e+00
# Fit a simple logistic regression with FDG as the only predictor,
# first on the full data set (for the summary shown below) ...
logit.AD.FDG <- glm(DX_bl ~ FDG, data = AD, family = "binomial")
summary(logit.AD.FDG)
## 
## Call:
## glm(formula = DX_bl ~ FDG, family = "binomial", data = AD)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4686  -0.8166  -0.2758   0.7679   2.7812  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  18.3300     1.7676   10.37   <2e-16 ***
## FDG          -2.9370     0.2798  -10.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 711.27  on 516  degrees of freedom
## Residual deviance: 499.00  on 515  degrees of freedom
## AIC: 503
## 
## Number of Fisher Scoring iterations: 5
# ... then refit on the training data and predict on the test data,
# requesting the linear predictor and its standard error.
logit.AD.FDG <- glm(DX_bl ~   FDG, data = data.train,
                    family = "binomial")
y_hat <- predict(logit.AD.FDG, data.test, type = "link",
                 se.fit = TRUE)
data.test$fit    <- y_hat$fit
data.test$se.fit <- y_hat$se.fit

# Point-wise 95% CIs for the fitted probabilities, mapped from the
# link scale with the logistic CDF plogis(x) = exp(x)/(1 + exp(x)),
# which is numerically stable for large |x|.
data.test <- within(data.test, {
  fitted    = plogis(fit)
  fit.lower = plogis(fit - 1.96 * se.fit)
  fit.upper = plogis(fit + 1.96 * se.fit)
})
library(ggplot2)
newData <- data.test[order(data.test$FDG),]
# DX_bl is a factor with levels "c0" < "c1", so its integer codes
# are 1 and 2; subtracting 1 maps them onto the 0/1 probability
# scale of the plot (replaces the original 4-statement recoding).
newData$DX_bl <- as.numeric(newData$DX_bl) - 1
p <- ggplot(newData, aes(x = FDG, y = DX_bl))
# shaded ribbon: point-wise 95% CI of the predicted probability
p <- p + geom_ribbon(aes(x = FDG, ymin = fit.lower,
                         ymax = fit.upper), alpha = 0.2)
# predicted probability curve
p <- p + geom_line(aes(x = FDG, y = fitted), colour="red")
# fitted values
p <- p + geom_point(aes(y = fitted), size=2, colour="red")
# observed values (0/1 outcomes)
p <- p + geom_point(size = 2)
p <- p + ylab("Probability") + theme(text = element_text(size=18))
p <- p + labs(title =
                "Observed and predicted probability of disease")
print(p)

# install.packages("reshape2")
# library() (not require()) so a missing package fails loudly;
# melt() reshapes the wide data frame into long format for faceting.
library(reshape2)
# Row index used as the id variable for melt()
data.train$ID <- seq_len(nrow(data.train))
# Keep AGE, PTEDUCAT, FDG, AV45, HippoNV plus DX_bl and ID
AD.long <- melt(data.train[,c(1,3,4,5,6,16,17)],
                id.vars = c("ID", "DX_bl"))
# Plot the data using ggplot
library(ggplot2)
p <- ggplot(AD.long, aes(x = factor(DX_bl), y = value))
# boxplot, size=.75 to stand out behind CI
p <- p + geom_boxplot(size = 0.75, alpha = 0.5)
# points for observed data
p <- p + geom_point(position = position_jitter(w = 0.05, h = 0),
                    alpha = 0.1)
# diamond at mean for each group
p <- p + stat_summary(fun = mean, geom = "point", shape = 18,
                      size = 6, alpha = 0.75, colour = "red")
# confidence limits based on normal distribution
p <- p + stat_summary(fun.data = "mean_cl_normal",
                      geom = "errorbar", width = .2, alpha = 0.8)
# one facet per variable, free y scales since units differ
p <- p + facet_wrap(~ variable, scales = "free_y", ncol = 3)
p <- p + labs(title =
    "Boxplots of variables by diagnosis (0 - normal; 1 - patient)")
print(p)

# Load the data-manipulation and modeling packages used by the
# process-monitoring example below. The "##" lines record the
# package start-up / masking messages from an interactive session.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
## The following object is masked from 'package:RCurl':
## 
##     complete
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ggplot2)

# Use a larger base font for every subsequent ggplot figure
theme_set(theme_gray(base_size = 15) )

# Monitoring(): real-time process monitoring via a random-forest
# classifier that tries to separate reference data from a sliding
# window of real-time data.
#   data0          - reference (in-control) data; its last column
#                    "class" is expected to be 0
#   data.real.time - stream of incoming observations to monitor
#   wsz            - sliding window size
# Returns a list with:
#   importance.mat - one row per time point: variable importance
#                    scores from the random forest
#   stat.mat       - one row per time point: OOB error rate of each
#                    class and the mean class-1 vote fraction over
#                    the current window
Monitoring <- function( data0, data.real.time, wsz ){
  num.data.points <- nrow(data.real.time)
  stat.mat <- NULL
  importance.mat <- NULL

  for (i in seq_len(num.data.points)) {
    # At the start of monitoring, when fewer than wsz real-time
    # points have arrived, pad the window with bootstrap samples
    # from the reference data to keep the window size constant.
    if (i < wsz) {
      # sample.size.from.reference (ssfr)
      ssfr <- wsz - i
      sample.reference <- data0[sample(nrow(data0), ssfr,
                                       replace = TRUE), ]
      current.real.time.data <- rbind(sample.reference,
                                      data.real.time[1:i, , drop = FALSE])
    } else {
      current.real.time.data <- data.real.time[(i - wsz + 1):i, ,
                                               drop = FALSE]
    }
    # Everything inside the current window is labeled class 1; the
    # reference data keeps its class-0 labels.
    current.real.time.data$class <- 1
    data <- rbind(data0, current.real.time.data)
    colnames(data) <- c(paste0("X", 1:(ncol(data) - 1)), "Class")
    data$Class <- as.factor(data$Class)

    # Classify reference vs. window with a random forest; balanced
    # sampling (wsz from each class) guards against the class
    # imbalance between nrow(data0) reference rows and wsz window rows.
    my.rf <- randomForest(Class ~ ., sampsize = c(wsz, wsz), data = data)

    # Collect the variable importance scores for this time point
    importance.mat <- rbind(importance.mat, t(my.rf$importance))

    # Monitoring statistics: the per-class OOB error rates live in
    # the "class.error" column of the confusion matrix. Access the
    # randomForest components by NAME ($confusion, $votes) instead of
    # the fragile positional indexing my.rf[5] / my.rf[6] used before.
    class.err <- my.rf$confusion[, "class.error"]
    ooberrornormal <- class.err[1]    # OOB error of class 0
    ooberrorabnormal <- class.err[2]  # OOB error of class 1

    # Mean fraction of class-1 votes among the window observations
    # (rows after the reference block)
    p1vote <- mean(my.rf$votes[(nrow(data0) + 1):nrow(data), 2])

    this.stat <- c(ooberrornormal, ooberrorabnormal, p1vote)
    stat.mat <- rbind(stat.mat, this.stat)
  }
  result <- list(importance.mat = importance.mat,
                 stat.mat = stat.mat)
  return(result)
}
# data generation
# Sizes of the reference data, the in-control real-time data, and
# the out-of-control real-time data. (The original code swapped
# length1/length2 between generation and conversion; harmless only
# because all three are 100 — now used consistently.)
length0 <- 100
length1 <- 100
length2 <- 100

# 2-dimension
dimension <- 2

# reference data
data0 <- rnorm( dimension * length0, mean = 0, sd = 1)
# real-time data with no change
data1 <- rnorm( dimension * length1, mean = 0, sd = 1)
# real-time data that differs from the reference data in the
# second variable (mean shifted from 0 to 2)
data2 <- cbind( V1 = rnorm( 1 * length2, mean = 0, sd = 1), 
                V2 = rnorm( 1 * length2, mean = 2, sd = 1) )

# convert to data frames
data0 <- matrix(data0, nrow = length0, byrow = TRUE) %>% 
  as.data.frame()
data1 <- matrix(data1, nrow = length1, byrow = TRUE) %>% 
  as.data.frame()
data2 <- data2 %>% as.data.frame()

# assign variable names
colnames( data0 ) <- paste0("X",1:ncol(data0))
colnames( data1 ) <- paste0("X",1:ncol(data1))
colnames( data2 ) <- paste0("X",1:ncol(data2))

# label reference data with class 0 and real-time data with class 1
data0 <- data0 %>% mutate(class = 0)
data1 <- data1 %>% mutate(class = 1)
data2 <- data2 %>% mutate(class = 1)

# the real-time stream is the in-control data followed by the
# out-of-control data
data.real.time <- rbind(data1,data2)

# scatterplot: reference vs. in-control real-time data
data.plot <- rbind( data0, data1 ) %>% mutate(class = factor(class))
ggplot(data.plot, aes(x=X1, y=X2, shape = class, color=class)) + 
  geom_point(size=3)

# scatterplot: reference vs. out-of-control real-time data
data.plot <- rbind( data0, data2 ) %>% mutate(class = factor(class))
ggplot(data.plot, aes(x=X1, y=X2, shape = class,
                      color=class)) + geom_point(size=3)

# Run the monitoring procedure with a window of 10 observations
wsz <- 10
result <- Monitoring( data0, data.real.time, wsz )
stat.mat <- result$stat.mat
importance.mat <- result$importance.mat

# plot the monitoring statistics over time
stat.mat <- data.frame(stat.mat)
stat.mat$id <- 1:nrow(stat.mat)
colnames(stat.mat) <- c("error0","error1","prob","id")
stat.mat <- stat.mat %>% gather(type, statistics, error0,
                                error1,prob)
# Note: the original chained geom_point() + geom_point(size=2),
# drawing the same points twice; a single layer suffices.
ggplot(stat.mat,aes(x=id,y=statistics,color=type)) + 
  geom_line(linetype = "dashed") +
  geom_point(size=2)

# plot the variable importance scores for diagnosis: the variable
# responsible for the change should show a rising importance score
importance.mat <- data.frame(importance.mat)
importance.mat$id <- 1:nrow(importance.mat)
colnames(importance.mat) <- c("X1","X2","id")
importance.mat <- importance.mat %>% 
  gather(variable, importance,X1,X2)
ggplot(importance.mat,aes(x=id,y=importance,
    color=variable)) + geom_line(linetype = "dashed") +
    geom_point(size=2)

# 10-dimensional case, with 2 variables shifted away from the
# normal condition (the LAST two columns, X9 and X10, get mean 20)
dimension <- 10
wsz <- 5
# reference data
data0 <- rnorm( dimension * length0, mean = 0, sd = 1)
# real-time data with no change
data1 <- rnorm( dimension * length1, mean = 0, sd = 1)
# real-time data: first (dimension - 2) variables unchanged, the
# last two shifted to mean 20
data2 <- c( rnorm( (dimension - 2) * length2, mean = 0, sd = 1), 
            rnorm( (2) * length2, mean = 20, sd = 1))


# convert to data frames
data0 <- matrix(data0, nrow = length0, byrow = TRUE) %>% 
  as.data.frame()
data1 <- matrix(data1, nrow = length1, byrow = TRUE) %>% 
  as.data.frame()
# data2 was generated column-block-wise, so fill by column;
# ncol = dimension replaces the hard-coded 10
data2 <- matrix(data2, ncol = dimension, byrow = FALSE) %>% 
  as.data.frame()

# assign reference data with class 0 and real-time data 
# with class 1
data0 <- data0 %>% mutate(class = 0)
data1 <- data1 %>% mutate(class = 1)
data2 <- data2 %>% mutate(class = 1)

# real-time stream: in-control data followed by out-of-control data
data.real.time <- rbind(data1,data2)
result <- Monitoring( data0, data.real.time, wsz )
stat.mat <- result$stat.mat
importance.mat <- result$importance.mat

# plot the monitoring statistics over time
stat.mat <- data.frame(stat.mat)
stat.mat$id <- 1:nrow(stat.mat)
colnames(stat.mat) <- c("error0","error1","prob","id")
stat.mat <- stat.mat %>% gather(type, statistics, error0,
                                error1,prob)
# single point layer (the original drew geom_point() twice)
ggplot(stat.mat,aes(x=id,y=statistics,color=type))+
  geom_line(linetype = "dashed") +
  geom_point(size=2)

# plot importance scores for diagnosis; the shifted variables
# (the last two columns) should dominate
importance.mat <- data.frame(importance.mat)
importance.mat$id <- 1:nrow(importance.mat)
importance.mat <- importance.mat %>% 
  gather(variable, importance,X1:X10)
# order the factor levels X1..X10 numerically (not alphabetically)
# so the legend reads naturally; 1:dimension avoids the magic 10
importance.mat$variable <- factor( importance.mat$variable,
                                   levels = paste0( "X", 1:dimension))
ggplot(importance.mat,aes(x=id,y=importance,color=
          variable)) + geom_line(linetype = "dashed") +
          geom_point(size=2)

# Create the frequency table in accordance with a categorization
# of HippoNV into 20 bins bounded by its 5%,10%,...,95% quantiles
temp <- quantile(AD$HippoNV, seq(from = 0.05, to = 0.95,
                                 by = 0.05))
AD$HippoNV.category <- cut(AD$HippoNV, breaks = c(-Inf,
                                                  temp, Inf))
# Cross-tabulate diagnosis by HippoNV bin; the table has two rows
# per bin (DX_bl = 0 and DX_bl = 1)
tempData <- data.frame(xtabs(~DX_bl + HippoNV.category, 
                              data = AD))
# keep only the DX_bl == 1 rows (every second row of the table)
tempData <- tempData[seq(from = 2, to = 
                   2*length(unique(AD$HippoNV.category)), 
                   by = 2),]
summary(xtabs(~DX_bl + HippoNV.category, data = AD))
## Call: xtabs(formula = ~DX_bl + HippoNV.category, data = AD)
## Number of cases in table: 517 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 229.23, df = 19, p-value = 4.822e-38
# Per-bin totals, then the empirical probability of being normal
# (p.hat = 1 - proportion diseased in the bin)
tempData$Total <- colSums(as.matrix(xtabs(~DX_bl +
                    HippoNV.category,data = AD)))
tempData$p.hat <- 1 - tempData$Freq/tempData$Total
tempData$HippoNV.category <- as.numeric(tempData$HippoNV.category)
str(tempData)
## 'data.frame':    20 obs. of  5 variables:
##  $ DX_bl           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ HippoNV.category: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Freq            : int  24 25 25 21 22 15 17 17 19 11 ...
##  $ Total           : num  26 26 26 26 26 25 26 26 26 34 ...
##  $ p.hat           : num  0.0769 0.0385 0.0385 0.1923 0.1538 ...
# Draw the scatterplot of HippoNV.category 
# versus the probability of normal
library(ggplot2)
p <- ggplot(tempData, aes(x = HippoNV.category, y = p.hat))
p <- p + geom_point(size=3)
p <- p + geom_smooth(method = "loess")
# labs() takes "x" (not "xlab") for the x-axis label; the original
# labs(..., xlab = ...) argument was silently ignored, so the axis
# label never appeared.
p <- p + labs(title = "Empirically observed probability of normal",
              x = "HippoNV")
print(p)
## `geom_smooth()` using formula 'y ~ x'

# library() (not require()) so a missing package fails loudly
library(rpart)
# Simulate a two-class data set whose true boundary is the diagonal
# X1 = X2, with labels randomized inside a thin band around it
ndata <- 2000
X1 <- runif(ndata, min = 0, max = 1)
X2 <- runif(ndata, min = 0, max = 1)
data <- data.frame(X1,X2)
data <- data %>% mutate( X12 = 0.5 * (X1 - X2), Y =
                           ifelse(X12>=0,1,0))
# flip labels at random within 0.05 of the boundary
ix <- which( abs(data$X12) <= 0.05)
data$Y[ix] <- ifelse(runif( length(ix)) < 0.5, 0, 1)
data <- data  %>% select(-X12) %>%  mutate(Y =
                          as.factor(as.character(Y)))
ggplot(data,aes(x=X1,y=X2,color=Y))+geom_point()

# Fit a logistic regression (linear boundary) and a decision tree
linear_model <- glm(Y ~  ., family = binomial(link = "logit"),
                                                  data = data)
tree_model <- rpart( Y ~  ., data = data)
# Predicted probability of class "1" from each model.
# predict.rpart with type="prob" returns one column per factor
# level ("0" then "1"), so column 2 is P(Y = "1"); the original
# code took column 1 (P(Y = "0")), which inverted the tree's
# predicted class labels relative to the glm's.
pred_linear <- predict(linear_model, data,type="response")
pred_tree <- predict(tree_model, data,type="prob")[,2]
# Threshold both probability vectors at 0.5 to get class labels
data_pred <- data %>% mutate(pred_linear_class =
    ifelse(pred_linear <0.5,0,1)) %>%mutate(pred_linear_class =
    as.factor(as.character(pred_linear_class)))%>%
    mutate(pred_tree_class = ifelse( pred_tree <0.5,0,1)) %>%
    mutate( pred_tree_class =
                      as.factor(as.character(pred_tree_class)))
# Visualize each model's decision regions
ggplot(data_pred,aes(x=X1,y=X2,color=pred_linear_class))+
  geom_point()

ggplot(data_pred,aes(x=X1,y=X2,color=pred_tree_class))+
  geom_point()

# Regression tree example: MMSCORE (a numeric score) is the outcome,
# predicted from the demographic/biomarker columns of the AD data
# (AGE, PTGENDER, PTEDUCAT among them).

# Step 1: read the data set into R
url <- paste0("https://raw.githubusercontent.com",
              "/analyticsbook/book/main/data/AD.csv")
AD <- read.csv(text = getURL(url))

# Step 2: data preprocessing — predictors plus the numeric outcome
X <- AD[, 2:16]
Y <- AD$MMSCORE
data <- data.frame(X, Y)
names(data)[16] <- "MMSCORE"

# Split the rows half and half into training and testing sets
train.ix <- sample(nrow(data), floor(nrow(data) / 2))
data.train <- data[train.ix, ]
data.test <- data[-train.ix, ]

# Step 3: grow the tree; method = "anova" is the setting for
# regression problems
tree_reg <- rpart(MMSCORE ~ ., data.train, method = "anova")

# Step 4: visualize the fitted tree
require(rpart.plot)
## Loading required package: rpart.plot
prp(tree_reg, nn.cex = 1)

# Step 5 -> prune the tree back at complexity parameter cp = 0.03
tree_reg <- prune(tree_reg, cp = 0.03)
prp(tree_reg, nn.cex = 1)

# Step 6 -> predict on the test set with the pruned tree
pred.tree <- predict(tree_reg, data.test)
# For a regression model the correlation between predictions and
# the observed outcome values measures how close the predictions are
cor(pred.tree, data.test$MMSCORE)
## [1] 0.3182434